# From end of 07-out-of-sample.R

# Any particular reason why we are under-predicting? --------------------------
# We are underpredicting by about 2 pct on average (in 2020)
nerc_fuel_gen_dt[,.(
  diff = (sum(net_generation_mwh)-sum(fit_net_generation_mwh))/sum(net_generation_mwh))
]
# Some variance, but generally true across all regions 
nerc_fuel_gen_dt[,.(
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh)),
  keyby = .(nerc_adj)
][,.(
  nerc_adj,
  diff = (mwh_actual - mwh_fit)/mwh_actual
)][order(diff)]
# Fuel categories do seem to be playing an important role
# We are over-predicting coal and under-predicting natural gas
nerc_fuel_gen_dt[,.(
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh)),
  keyby = .(fuel_category)
][,.(
  fuel_category,
  diff = (mwh_actual - mwh_fit)/mwh_actual
)][order(diff)]
nerc_fuel_gen_dt[,.(
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh)),
  keyby = .(nerc_adj, fuel_category)
][,.(
  nerc_adj, 
  fuel_category,
  diff = mwh_actual - mwh_fit
)][order(diff)]

# Can we check if this is also true in-sample? 
# from 06-visualize...
setnames(
  elec_gen_dt,
  old = c('tot_net_generation_mwh','tot_fit_net_generation_mwh'),
  new = c('net_generation_mwh','fit_net_generation_mwh')
)
elec_gen_dt[,.(
  cat = 'total',
  diff = (sum(net_generation_mwh)-sum(fit_net_generation_mwh))/sum(net_generation_mwh))
]
# Regions
elec_gen_dt[,.(
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh)),
  keyby = .(nerc_adj)
][,.(
  nerc_adj,
  diff = (mwh_actual - mwh_fit)/mwh_actual
)][order(diff)]
# Fuel categories
elec_gen_dt[,.(
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh)),
  keyby = .(fuel_category)
][,.(
  fuel_category,
  diff = (mwh_actual - mwh_fit)/mwh_actual
)][order(diff)]



# So we are also under-predicting in 2019 data, but point on fuel types remains 
# Getting file paths for all plants in region
plant_gen_fp = list.files(
  here('Data/electricity-generation/plant-model-fit/plant-gen-dt'),
  full.names = TRUE
)
nerc_adj_in = 'CAL'
plant_gen_fp = list.files(
  here('Data/electricity-generation/plant-model-fit/plant-gen-dt'),
  full.names = TRUE
)
nerc_plant_gen_fp = 
  plant_gen_fp[
    str_extract(plant_gen_fp, '(?<=-)\\d*(?=\\.fst)') |> as.integer()
    %in% plant_info_dt[nerc_adj == nerc_adj_in]$plant_id_eia
  ]
# Summarizing generation by time, ic, nerc, fuel cat
cal_elec_gen_dt = 
  map_dfr(
    nerc_plant_gen_fp, 
    read.fst,
    as.data.table = TRUE
  ) |>
  merge(
    plant_info_dt, 
    by = 'plant_id_eia'
  )
# Verify we are getting the same under-prediction: 
cal_elec_gen_dt[,.(
  mhw_actual = sum(net_generation_mwh),
  mhw_fit = sum(fit_net_generation_mwh), 
  diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh)
)]
# Are there particular plants, or is it across all plants? 
cal_elec_gen_dt[,
.(
  mhw_actual = sum(net_generation_mwh),
  mhw_fit = sum(fit_net_generation_mwh), 
  diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh)
), keyby = .(plant_id_eia, fuel_category)
] |>
ggplot(
  aes(x = diff)) + 
  geom_histogram(bins = 50)

cal_elec_gen_dt[,.(
  count_above_npc = sum(net_generation_mwh > namepcap),
  count_below_0 = sum(net_generation_mwh < 0),
  fit_count_above_npc = sum(fit_net_generation_mwh > namepcap),
  fit_count_below_0 = sum(fit_net_generation_mwh < 0)
)]

cal_elec_gen_dt[,
  net_generation_mwh_nonneg := fifelse(net_generation_mwh < 0, 0, net_generation_mwh)
]

cal_elec_gen_dt[,.(
  mhw_actual = sum(net_generation_mwh),
  mhw_actual_nonneg = sum(net_generation_mwh_nonneg),
  mhw_fit = sum(fit_net_generation_mwh), 
  diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh), 
  diff_noneg = (sum(net_generation_mwh_nonneg) - sum(fit_net_generation_mwh))/sum(net_generation_mwh_nonneg)
)]

cal_elec_gen_dt[,
  mean(net_generation_mwh < 0),
  keyby = .(plant_id_eia, fuel_category)
][V1 > 0]

# So what is the deal? 
cal_elec_gen_dt[,.(
  mwh_actual_raw = sum(net_generation_mwh_raw),
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh), 
  mwh_fit_raw = sum(fit_net_generation_mwh_raw), 
  diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh), 
  diff_raw = (sum(net_generation_mwh) - sum(fit_net_generation_mwh_raw))/sum(net_generation_mwh)
)]

# Is it at low or high values? 
cal_elec_gen_dt[,
.(
  mhw_actual = sum(net_generation_mwh),
  mhw_fit = sum(fit_net_generation_mwh), 
  diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh)
), keyby = .(plant_id_eia, fuel_category)
][diff < 0.001 & diff > -0.001]

ggplot(
  cal_elec_gen_dt[
    plant_id_eia %in% c(55077, 55182, 55393,55518,55748, 55985, 56298,56026,56476)
  ], 
  aes(x = net_generation_mwh, y = fit_net_generation_mwh)
) + 
geom_point(alpha = 0.2) + 
geom_smooth() + 
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')

ggplot(
  cal_elec_gen_dt[
    plant_id_eia %in% c(55184, 52107, 52085,55184,56051, 57043, 57482,58168,58302)
  ], 
  aes(x = net_generation_mwh, y = fit_net_generation_mwh)
) + 
geom_point(alpha = 0.2) + 
geom_smooth() + 
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')


# Can we look at the residuals of the good and bad ones? 
cal_elec_gen_dt[,
  perf := fcase(
    plant_id_eia %in% c(55077, 55182, 55393,55518,55748, 55985, 56298,56026,56476), 'bad',
    plant_id_eia %in% c(55184, 52107, 52085,55184,56051, 57043, 57482,58168,58302), 'good'
  )
]
ggplot(
  cal_elec_gen_dt[!is.na(perf)], 
  aes(x = net_generation_mwh - fit_net_generation_mwh, color= perf, fill = perf)
) + 
geom_histogram(bins = 100) + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')

ggplot(
  cal_elec_gen_dt[!is.na(perf)], 
  aes(x = net_generation_mwh, color= perf, fill = perf)
) + 
geom_histogram(bins = 100) + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')

# Is underprediction correlated with number of hours == 0 
plant_annual_dt = 
  map_dfr(
    plant_gen_fp, 
    \(x){
      read.fst(path = x, as.data.table = TRUE)[,.(
        mwh_actual = sum(net_generation_mwh),
        mwh_fit = sum(fit_net_generation_mwh), 
        hour_count = .N,
        pct_hours_zero = mean(net_generation_mwh == 0),
        pct_fit_hours_zero = mean(fit_net_generation_mwh == 0)), 
        keyby = plant_id_eia
      ]
    }
  )[,":="(
      diff = mwh_actual - mwh_fit,
      diff_pct = (mwh_actual - mwh_fit)/mwh_actual
  )] |> 
  merge(
    plant_info_dt, 
    by = 'plant_id_eia'
  )

ggplot(data = plant_annual_dt[abs(diff_pct) < 0.5], aes(x = pct_hours_zero, y = diff_pct)) + 
geom_point(aes(color = fuel_category)) + 
#geom_smooth(method = 'lm') + 
geom_hline(yintercept = 0, linetype = 'solid') + 
scale_x_continuous(
  name = 'Percent of hours Zero', 
  labels = scales::label_percent()
) +
scale_y_continuous(
  name = 'Aggregate Prediction Error (%)',
  labels = scales::label_percent()
) + 
scale_color_brewer(
  name = 'Fuel Type',
  palette = 'Dark2'
) 
theme_minimal() 


ggplot(data = plant_annual_dt[abs(diff_pct)<0.5], aes(x = pct_hours_zero, y = pct_fit_hours_zero)) + 
#geom_smooth(method = 'lm') + + 
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') +
geom_point(aes(color = diff_pct)) + 
geom_hline(yintercept = 0, linetype = 'solid') + 
scale_x_continuous(
  name = 'Actual Percent of hours Zero', 
  labels = scales::label_percent()
) +
scale_y_continuous(
  name = 'Fit Percent of hours Zero (%)',
  labels = scales::label_percent()
) + 
scale_color_viridis_c(
  name = 'Error pct',
  option = 'magma'
) + 
theme_minimal() 


# Ok maybe I should be looking at these bad plants 
bad_plants = 
  cal_elec_gen_dt[,.(
    mwh_actual = sum(net_generation_mwh),
    mwh_fit = sum(fit_net_generation_mwh), 
    diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh)),
    diff_pct = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh),
    hours_zero = sum(net_generation_mwh == 0)), 
    keyby = .(plant_id_eia, fuel_category)
  ][diff > 5e4]$plant_id_eia
# Distribution of actual production
ggplot(
  cal_elec_gen_dt[plant_id_eia %in% bad_plants], 
  aes(x = net_generation_mwh)
) + 
geom_histogram(bins = 100) + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')
ggplot(
  cal_elec_gen_dt[plant_id_eia %in% bad_plants], 
  aes(x = fit_net_generation_mwh)
) + 
geom_histogram(bins = 100) + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')
ggplot(
  cal_elec_gen_dt[plant_id_eia %in% bad_plants], 
  aes(x = net_generation_mwh - fit_net_generation_mwh)
) + 
geom_histogram(bins = 100) + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')



ggplot(
  cal_elec_gen_dt[plant_id_eia %in% bad_plants], 
  aes(x = net_generation_mwh, y = fit_net_generation_mwh)
) + 
geom_point(alpha = 0.2) + 
geom_smooth() + 
geom_abline(slope = 1, intercept = 0, linetype = 'dashed') + 
theme_minimal() + 
facet_wrap(~plant_id_eia, scales = 'free')

cal_elec_gen_dt[,.(
  mwh_actual = sum(net_generation_mwh),
  mwh_fit = sum(fit_net_generation_mwh), 
  diff = (sum(net_generation_mwh) - sum(fit_net_generation_mwh)),
  diff_pct = (sum(net_generation_mwh) - sum(fit_net_generation_mwh))/sum(net_generation_mwh),
  hours_zero = sum(net_generation_mwh == 0),
  pct_hours_zero = mean(net_generation_mwh == 0)), 
  keyby = .(fuel_category, plant_id_eia %in% bad_plants)
]

